# suppress warnings
options(warn=-1)
library(ggplot2)
suppressPackageStartupMessages(library(viridis))
library(plyr)
library(reshape2)
library(readr)
library(yaml)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## The following object is masked from 'package:viridis':
##
## viridis_pal
library(parallel)
library(futile.logger)
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(optparse))
# option parsing #
# "Input file or directory (required). Either a full path to a sequence_summary.txt file, or a full path to a directory containing one or more such files. In the latter case the directory is searched recursively."
# "Output directory (required). If a single sequencing_summary.txt file is passed as input, then the output directory will contain just the plots associated with that file. If a directory containing more than one sequencing_summary.txt files is passed as input, then the plots will be put into sub-directories that have the same names as the parent directories of each sequencing_summary.txt file"
# "The cutoff value for the mean Q score of a read (default 7). Used to create separate plots for reads above and below this threshold"
#"Number of processors to use for the anlaysis (default 1). Only helps when you are analysing more than one sequencing_summary.txt file at a time"
input.file <- params$input.file
output.dir <- params$output.dir
q <- params$q
cores <- params$q
q_title = paste("Q>=", q, sep="")
# build the map for R9.5
p1 = data.frame(channel=33:64, row=rep(1:4, each=8), col=rep(1:8, 4))
p2 = data.frame(channel=481:512, row=rep(5:8, each=8), col=rep(1:8, 4))
p3 = data.frame(channel=417:448, row=rep(9:12, each=8), col=rep(1:8, 4))
p4 = data.frame(channel=353:384, row=rep(13:16, each=8), col=rep(1:8, 4))
p5 = data.frame(channel=289:320, row=rep(17:20, each=8), col=rep(1:8, 4))
p6 = data.frame(channel=225:256, row=rep(21:24, each=8), col=rep(1:8, 4))
p7 = data.frame(channel=161:192, row=rep(25:28, each=8), col=rep(1:8, 4))
p8 = data.frame(channel=97:128, row=rep(29:32, each=8), col=rep(1:8, 4))
q1 = data.frame(channel=1:32, row=rep(1:4, each=8), col=rep(16:9, 4))
q2 = data.frame(channel=449:480, row=rep(5:8, each=8), col=rep(16:9, 4))
q3 = data.frame(channel=385:416, row=rep(9:12, each=8), col=rep(16:9, 4))
q4 = data.frame(channel=321:352, row=rep(13:16, each=8), col=rep(16:9, 4))
q5 = data.frame(channel=257:288, row=rep(17:20, each=8), col=rep(16:9, 4))
q6 = data.frame(channel=193:224, row=rep(21:24, each=8), col=rep(16:9, 4))
q7 = data.frame(channel=129:160, row=rep(25:28, each=8), col=rep(16:9, 4))
q8 = data.frame(channel=65:96, row=rep(29:32, each=8), col=rep(16:9, 4))
map = rbind(p1, p2, p3, p4, p5, p6, p7, p8, q1, q2, q3, q4, q5, q6, q7, q8)
add_cols <- function(d, min.q){
# take a sequencing sumamry file (d), and a minimum Q value you are interested in (min.q)
# return the same data frame with the following columns added
# cumulative.bases
# hour of run
# reads.per.hour
d = subset(d, mean_qscore_template >= min.q)
if(nrow(d)==0){
flog.error(paste("There are no reads with a mean Q score higher than your cutoff of ", min.q, ". Please choose a lower cutoff and try again.", sep = ""))
quit()
}
d = merge(d, map, by="channel")
d = d[with(d, order(-sequence_length_template)), ] # sort by read length
d$cumulative.bases = cumsum(as.numeric(d$sequence_length_template))
d$hour = d$start_time %/% 3600
# add the reads generated for each hour
reads.per.hour = as.data.frame(table(d$hour))
names(reads.per.hour) = c("hour", "reads_per_hour")
reads.per.hour$hour = as.numeric(as.character(reads.per.hour$hour))
d = merge(d, reads.per.hour, by = c("hour"))
return(d)
}
load_summary <- function(filepath, min.q){
# load a sequencing summary and add some info
# min.q is a vector of length 2 defining 2 levels of min.q to have
# by default the lowest value is -Inf, i.e. includes all reads. The
# other value in min.q is set by the user at the command line
d = read_tsv(filepath, col_types = cols_only(channel = 'i',
num_events_template = 'i',
sequence_length_template = 'i',
mean_qscore_template = 'n',
sequence_length_2d = 'i',
mean_qscore_2d = 'n',
start_time = 'n'))
if("sequence_length_2d" %in% names(d)){
# it's a 1D2 or 2D run
d$sequence_length_template = as.numeric(as.character(d$sequence_length_2d))
d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_2d))
d$num_events_template = NA
d$start_time = as.numeric(as.character(d$start_time))
}else{
d$sequence_length_template = as.numeric(as.character(d$sequence_length_template))
d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_template))
d$num_events_template = as.numeric(as.character(d$num_events_template))
d$start_time = as.numeric(as.character(d$start_time))
}
d$events_per_base = d$num_events_template/d$sequence_length_template
flowcell = basename(dirname(filepath))
# add columns for all the reads
d1 = add_cols(d, min.q[1])
d1$Q_cutoff = "All reads"
# add columns for just the reads that pass the user Q threshold
d2 = add_cols(d, min.q[2])
d2$Q_cutoff = q_title
# bind those two together into one data frame
d = as.data.frame(rbindlist(list(d1, d2)))
# name the flowcell (useful for analyses with >1 flowcell)
d$flowcell = flowcell
# make sure this is a factor
d$Q_cutoff = as.factor(d$Q_cutoff)
keep = c("hour","start_time", "channel", "sequence_length_template", "mean_qscore_template", "row", "col", "cumulative.bases", "reads_per_hour", "Q_cutoff", "flowcell", "events_per_base")
d = d[keep]
return(d)
}
reads.gt <- function(d, len){
# return the number of reads in data frame d
# that are at least as long as length len
return(length(which(d$sequence_length_template>=len)))
}
bases.gt <- function(d, len){
# return the number of bases contained in reads from
# data frame d
# that are at least as long as length len
reads = subset(d, sequence_length_template >= len)
return(sum(as.numeric(reads$sequence_length_template)))
}
log10_minor_break = function (...){
# function to add minor breaks to a log10 graph
# hat-tip: https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot
function(x) {
minx = floor(min(log10(x), na.rm=T))-1;
maxx = ceiling(max(log10(x), na.rm=T))+1;
n_major = maxx-minx+1;
major_breaks = seq(minx, maxx, by=1)
minor_breaks =
rep(log10(seq(1, 9, by=1)), times = n_major)+
rep(major_breaks, each = 9)
return(10^(minor_breaks))
}
}
binSearch <- function(min, max, df, t = 100000) {
# binary search algorithm, thanks to https://stackoverflow.com/questions/46292438/optimising-a-calculation-on-every-cumulative-subset-of-a-vector-in-r/46303384#46303384
# the aim is to return the number of reads in a dataset (df)
# that comprise the largest subset of reads with an N50 of t
# we use this to calculte the number of 'ultra long' reads
# which are defined as those with N50 > 100KB
mid = floor(mean(c(min, max)))
if (mid == min) {
if (df$sequence_length_template[min(which(df$cumulative.bases>df$cumulative.bases[min]/2))] < t) {
return(min - 1)
} else {
return(max - 1)
}
}
n = df$sequence_length_template[min(which(df$cumulative.bases>df$cumulative.bases[mid]/2))]
if (n >= t) {
return(binSearch(mid, max, df))
} else {
return(binSearch(min, mid, df))
}
}
summary.stats <- function(d, Q_cutoff="All reads"){
# Calculate summary stats for a single value of min.q
rows = which(as.character(d$Q_cutoff)==Q_cutoff)
d = d[rows,]
d = d[with(d, order(-sequence_length_template)), ] # sort by read length, just in case
total.bases = sum(as.numeric(d$sequence_length_template))
total.reads = nrow(d)
N50.length = d$sequence_length_template[min(which(d$cumulative.bases > (total.bases/2)))]
mean.length = round(mean(as.numeric(d$sequence_length_template)), digits = 1)
median.length = round(median(as.numeric(d$sequence_length_template)), digits = 1)
max.length = max(as.numeric(d$sequence_length_template))
mean.q = round(mean(d$mean_qscore_template), digits = 1)
median.q = round(median(d$mean_qscore_template), digits = 1)
#calculate ultra-long reads and bases (max amount of data with N50>100KB)
ultra.reads = binSearch(1, nrow(d), d, t = 100000)
if(ultra.reads>=1){
ultra.gigabases = sum(as.numeric(d$sequence_length_template[1:ultra.reads]))/1000000000
}else{
ultra.gigabases = 0
}
reads = list(
reads.gt(d, 10000),
reads.gt(d, 20000),
reads.gt(d, 50000),
reads.gt(d, 100000),
reads.gt(d, 200000),
reads.gt(d, 500000),
reads.gt(d, 1000000),
ultra.reads)
names(reads) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong")
bases = list(
bases.gt(d, 10000)/1000000000,
bases.gt(d, 20000)/1000000000,
bases.gt(d, 50000)/1000000000,
bases.gt(d, 100000)/1000000000,
bases.gt(d, 200000)/1000000000,
bases.gt(d, 500000)/1000000000,
bases.gt(d, 1000000)/1000000000,
ultra.gigabases)
names(bases) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong")
return(list('total.gigabases' = total.bases/1000000000,
'total.reads' = total.reads,
'N50.length' = N50.length,
'mean.length' = mean.length,
'median.length' = median.length,
'max.length' = max.length,
'mean.q' = mean.q,
'median.q' = median.q,
'reads' = reads,
'gigabases' = bases
))
}
channel.summary <- function(d){
# calculate summaries of what happened in each of the channels
# of a flowcell
a = ddply(d, .(channel),
summarize,
total.bases = sum(sequence_length_template),
total.reads = sum(which(sequence_length_template>=0)),
mean.read.length = mean(sequence_length_template),
median.read.length = median(sequence_length_template))
b = melt(a, id.vars = c("channel"))
return(b)
}
flog.info(paste("Loading input file:", input.file))
## INFO [2017-12-11 18:05:31] Loading input file: example_input/RB7_A2/sequencing_summary.txt
d = load_summary(input.file, min.q=c(-Inf, q))
flowcell = unique(d$flowcell)
flog.info(paste(sep = "", flowcell, ": creating output directory:", output.dir))
## INFO [2017-12-11 18:05:44] RB7_A2: creating output directory:example_output/RB7_A2/minion_qc_report
dir.create(output.dir)
out.txt = file.path(output.dir, "summary.yaml")
flog.info(paste(sep = "", flowcell, ": summarising input file for flowcell"))
## INFO [2017-12-11 18:05:44] RB7_A2: summarising input file for flowcell
all.reads.summary = summary.stats(d, Q_cutoff = "All reads")
q10.reads.summary = summary.stats(d, Q_cutoff = q_title)
summary = list("input file" = input.file,
"All reads" = all.reads.summary,
cutoff = q10.reads.summary,
"notes" = 'ultralong reads refers to the largest set of reads with N50>100KB')
names(summary)[3] = q_title
write(as.yaml(summary), out.txt)
muxes = seq(from = 0, to = max(d$hour), by = 8)
Read length on a log10 scale (x axis) vs counts (y axis). Caution: doesn’t tell how much data (i.e.total yield) you have for reads above a given length though. For that, see the yield_summary plots.
# make plots
flog.info(paste(sep = "", flowcell, ": plotting length histogram"))
## INFO [2017-12-11 18:05:46] RB7_A2: plotting length histogram
p1 = ggplot(d, aes(x = sequence_length_template)) +
geom_histogram(bins = 300) +
scale_x_log10(minor_breaks=log10_minor_break()) +
facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") +
theme(text = element_text(size = 15)) +
xlab("Read length") +
ylab("Number of reads")
ggsave(filename = file.path(output.dir, "length_histogram.png"), width = 960/75, height = 960/75, plot = p1)
p1
Mean Q score for a read (x axis) vs counts (y axis). Q7 seems to be a good cut-off distinguishing ‘good’ and ‘bad’ reads, although bad reads might have useful info.
flog.info(paste(sep = "", flowcell, ": plotting mean Q score histogram"))
## INFO [2017-12-11 18:05:58] RB7_A2: plotting mean Q score histogram
p2 = ggplot(d, aes(x = mean_qscore_template)) +
geom_histogram(bins = 300) +
facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") +
theme(text = element_text(size = 15)) +
xlab("Mean Q score of read") +
ylab("Number of reads")
ggsave(filename = file.path(output.dir, "q_histogram.png"), width = 960/75, height = 960/75, plot = p2)
p2
Read length on a log10 scale (x axis) vs mean Q score (y axis). Points are coloured by the events per base. ‘Good’ reads are ~1.5 events per base, and ‘bad’ reads are >>1.5 events per base. We often see a group of very short, ‘bad’, low-quality reads, perhaps related to DNA extraction. In this plot, the point size, transperency, and plot size are always the same no matter the input data. This facilitates comparison of these plots among flowcells - those with more reads will look darker because there will be more points. If you have a 1D2 run, there will be no colours on this plot, because Albacore doesn’t report the number of events per read when it combines the two reads of a 1D2 run into a 2D read.
flog.info(paste(sep = "", flowcell, ": plotting read length vs. q score scatterplot"))
## INFO [2017-12-11 18:06:06] RB7_A2: plotting read length vs. q score scatterplot
p10 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = sequence_length_template, y = mean_qscore_template, colour = events_per_base)) +
geom_point(alpha=0.05, size = 0.4) +
scale_x_log10(minor_breaks=log10_minor_break()) +
labs(colour='Events per base\n(log scale)\n') +
theme(text = element_text(size = 15)) +
xlab("Read length") +
ylab("Mean Q score of read")
if(max(d$events_per_base, na.rm=T)>0){
# a catch for 1D2 runs which don't have events per base
p10 = p10 + scale_colour_viridis(trans = "log", labels = scientific, option = 'inferno')
}
ggsave(filename = file.path(output.dir, "length_vs_q.png"), width = 960/75, height = 960/75, plot = p10)
p10
The mean read length (y axis) over time (x axis). This let’s you see if you are running out of longer reads as the run progresses. Muxes, which occur every 8 hours, are shown as red dashed lines.
flog.info(paste(sep = "", flowcell, ": plotting sequence length over time"))
## INFO [2017-12-11 18:06:47] RB7_A2: plotting sequence length over time
e = subset(d, Q_cutoff=="All reads")
e$Q = paste(">=", q, sep="")
e$Q[which(e$mean_qscore_template<q)] = paste("<", q, sep="")
p7 = ggplot(e, aes(x=start_time/3600, y=sequence_length_template, colour = Q, group = Q)) +
geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
geom_smooth() +
xlab("Hours into run") +
ylab("Mean read length") +
ylim(0, NA)
ggsave(filename = file.path(output.dir, "length_by_hour.png"), width = 960/75, height = 480/75, plot = p7)
## `geom_smooth()` using method = 'gam'
p7
## `geom_smooth()` using method = 'gam'
The mean Q score (y axis) over time (x axis). Q scores drop noticably over time - presumably this is a result of the pores wearing out, or the DNA accumulating damage, or both. Muxes, which occur every 8 hours, are shown as red dashed lines.
flog.info(paste(sep = "", flowcell, ": plotting Q score over time"))
## INFO [2017-12-11 18:07:00] RB7_A2: plotting Q score over time
p8 = ggplot(e, aes(x=start_time/3600, y=mean_qscore_template, colour = Q, group = Q)) +
geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
geom_smooth() +
xlab("Hours into run") +
ylab("Mean Q score") +
ylim(0, NA)
ggsave(filename = file.path(output.dir, "q_by_hour.png"), width = 960/75, height = 480/75, plot = p8)
## `geom_smooth()` using method = 'gam'
p8
## `geom_smooth()` using method = 'gam'
The number of reads (y axis) obtained in each hour (x axis). Muxes (every 8 hours) are plotted in red dotted lines. You can typically see that each mux results in a noticable increase in the number of reads per hour.
flog.info(paste(sep = "", flowcell, ": plotting reads per hour"))
## INFO [2017-12-11 18:07:10] RB7_A2: plotting reads per hour
f = d[c("hour", "reads_per_hour", "Q_cutoff")]
f = f[!duplicated(f),]
g = subset(f, Q_cutoff=="All reads")
h = subset(f, Q_cutoff==q_title)
max = max(f$hour)
# all of this is just to fill in hours with no reads recorded
all = 0:max
add.g = all[which(all %in% g$hour == FALSE)]
if(length(add.g)>0){
add.g = data.frame(hour = add.g, reads_per_hour = 0, Q_cutoff = "All reads")
g = rbind(g, add.g)
}
add.h = all[which(all %in% h$hour == FALSE)]
if(length(add.h)>0){
add.h = data.frame(hour = add.h, reads_per_hour = 0, Q_cutoff = q_title)
h = rbind(h, add.h)
}
i = rbind(g, h)
i$Q_cutoff = as.character(i$Q_cutoff)
i$Q_cutoff[which(i$Q_cutoff==q_title)] = paste("Q>=", q, sep="")
p9 = ggplot(i, aes(x=hour, y=reads_per_hour, colour = Q_cutoff, group = Q_cutoff)) +
geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
geom_point() +
geom_line() +
xlab("Hours into run") +
ylab("Number of reads per hour") +
ylim(0, NA) +
scale_color_discrete(guide = guide_legend(title = "Reads"))
ggsave(filename = file.path(output.dir, "reads_per_hour.png"), width = 960/75, height = 480/75, plot = p9)
p9
The total yield (y axis) for any given minimum read length (x axis). This is just like the ‘reads’ table in the summary.yaml output, but done across all read lengths up to the read length that includes 99% of the total yield.
flog.info(paste(sep = "", flowcell, ": plotting flowcell yield summary"))
## INFO [2017-12-11 18:07:14] RB7_A2: plotting flowcell yield summary
p6 = ggplot(d, aes(x=sequence_length_template, y=cumulative.bases, colour = Q_cutoff)) +
geom_line(size = 1) +
xlab("Minimum read length") +
ylab("Total yield in bases") +
scale_colour_discrete(guide = guide_legend(title = "Reads")) +
theme(text = element_text(size = 15))
xmax = max(d$sequence_length_template[which(d$cumulative.bases > 0.01 * max(d$cumulative.bases))])
p6 = p6 + scale_x_continuous(limits = c(0, xmax))
ggsave(filename = file.path(output.dir, "yield_summary.png"), width = 960/75, height = 960/75, plot = p6)
p6
Histograms of total bases, total reads, mean read length, and median read length that show the variance across the 512 available channels. Repeated for all data and reads with Q>10.
flog.info(paste(sep = "", flowcell, ": plotting flowcell channels summary histograms"))
## INFO [2017-12-11 18:08:04] RB7_A2: plotting flowcell channels summary histograms
c = channel.summary(subset(d, Q_cutoff=="All reads"))
c10 = channel.summary(subset(d, Q_cutoff==q_title))
c$Q_cutoff = "All reads"
c10$Q_cutoff = q_title
cc = rbind(c, c10)
cc$variable = as.character(cc$variable)
cc$variable[which(cc$variable=="total.bases")] = "Number of bases per channel"
cc$variable[which(cc$variable=="total.reads")] = "Number of reads per channel"
cc$variable[which(cc$variable=="mean.read.length")] = "Mean read length per channel"
cc$variable[which(cc$variable=="median.read.length")] = "Median read length per channel"
p11 = ggplot(cc, aes(x = value)) + geom_histogram(bins = 30) +
facet_grid(Q_cutoff~variable, scales = "free_x") +
theme(text = element_text(size = 10))
ggsave(filename = file.path(output.dir, "channel_summary.png"), width = 2400/75, height = 960/75, plot = p11)
p11
The 512 channels are laid out as on the R9.5 flowcell. Each panel of the plot shows time on the x axis, and read length on the y axis. Points are coloured by the Q score. This gives a little insight into exactly what was going on in each of your channels over the course of the run. One thing of note in these plots is the frequent (and sometimes extended) periods in which some pores produce only very short, very low quality ‘reads’. Maybe due to residual contaminants in DNA extractions blocking the pores. A blocked pore looks like a change in current. And if the blockage is persistent (e.g. a large molecule just sitting blocking the pore, occasionally letting some current through) this could produce exactly this kind of pattern.
flog.info(paste(sep = "", flowcell, ": plotting flowcell overview"))
## INFO [2017-12-11 18:08:14] RB7_A2: plotting flowcell overview
p5 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x=start_time/3600, y=sequence_length_template, colour = mean_qscore_template)) +
geom_point(size=1.5, alpha=0.35) +
scale_colour_viridis() +
labs(colour='Q') +
scale_y_log10() +
facet_grid(row~col) +
theme(panel.spacing = unit(0.5, "lines")) +
xlab("Hours into run") +
ylab("Read length") +
theme(text = element_text(size = 40), axis.text.x = element_text(size=12), axis.text.y = element_text(size=12), legend.text=element_text(size=12))
ggsave(filename = file.path(output.dir, "flowcell_overview.png"), width = 2500/75, height = 2400/75, plot = p5)
p5
Same map but with events per base.
flog.info(paste(sep = "", flowcell, ": plotting flowcell overview events"))
## INFO [2017-12-11 18:10:15] RB7_A2: plotting flowcell overview events
p12 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x=start_time/3600, y=events_per_base, colour = mean_qscore_template)) +
geom_point(size=1.5, alpha=0.35) +
scale_colour_viridis(option = 'inferno') +
labs(colour='Q') +
scale_y_log10() +
facet_grid(row~col) +
theme(panel.spacing = unit(0.5, "lines")) +
xlab("Hours into run") +
ylab("Events per base") +
theme(text = element_text(size = 40), axis.text.x = element_text(size=12), axis.text.y = element_text(size=12), legend.text=element_text(size=12))
ggsave(filename = file.path(output.dir, "flowcell_overview_events.png"), width = 2500/75, height = 2400/75, plot = p12)
p12